# Clean data ----
# Convert the categorical columns to factors. `survived` itself is left
# untouched; its factor version is stored in a separate `status` column.
dat <- dat %>%
  mutate(
    across(
      c(gender, marital_status, category, class, embarked, disembarked),
      as.factor
    ),
    status = as.factor(survived)
  )
# Derive `nationality2`, a cleaned-up nationality label:
#   * "Unknown"         -> NA
#   * "Sweden"          -> "Swedish" (previously misspelled "Sweedish", which
#                          created a second, separate Swedish category)
#   * "Syrian,Lebanese" -> "Syrian/Lebanese"
#   * any single nationality in the list below is kept unchanged
#   * everything else   -> "Other - Multiple"
# A missing `nationality` matches none of the conditions and therefore also
# falls through to "Other - Multiple".
dat <- dat %>%
  mutate(
    nationality2 = case_when(
      nationality == "Unknown" ~ NA_character_,
      nationality == "Sweden" ~ "Swedish",
      nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
      nationality %in% c(
        "English", "Irish", "American", "Swedish", "Finnish", "Scottish",
        "French", "Italian", "Canadian", "Bulgarian", "Croatian", "Belgian",
        "Norwegian", "Channel Islander", "Welsh", "Swiss", "German", "Danish",
        "Spanish", "Australian", "Polish", "South African", "Bosnian",
        "Hong Kongese", "Dutch", "Lithuanian", "Greek", "Portuguese",
        "Uruguayan", "Chinese", "Slovenian", "Cape Verdean", "Egyptian",
        "Japanese", "Hungarian", "Latvian", "Austrian", "Mexican", "Turkish",
        "Guyanese", "Haitian"
      ) ~ as.character(nationality),
      TRUE ~ "Other - Multiple"
    )
  )
# Descriptives ----
# Passenger counts per class and gender, plus each gender's share (%) of its
# class. Rows with a missing gender are excluded.
dat %>%
  filter(category == "Passenger", !is.na(gender)) %>%
  group_by(class, gender) %>%
  summarise(count = n()) %>%
  # The result is still grouped by class, so sum(count) is the class total.
  mutate(percent = 100 * (count / sum(count)))
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups: class [3]
## class gender count percent
## <fct> <fct> <int> <dbl>
## 1 1st Class Female 153 43.7
## 2 1st Class Male 197 56.3
## 3 2nd Class Female 112 38.4
## 4 2nd Class Male 180 61.6
## 5 3rd Class Female 216 30.5
## 6 3rd Class Male 493 69.5
# Breakdown of nationalities: count and share (%) of every known
# `nationality2` value, largest share first.
# NOTE(review): unlike the other summaries, no `category == "Passenger"`
# filter is applied here, so rows of every category are counted -- confirm
# this is intended.
dat %>%
  filter(!is.na(nationality2)) %>%
  group_by(nationality2) %>%
  summarise(count = n()) %>%
  mutate(percent = 100 * (count / sum(count))) %>%
  arrange(desc(percent))
## # A tibble: 44 x 3
## nationality2 count percent
## <chr> <int> <dbl>
## 1 English 1037 42.4
## 2 Irish 361 14.7
## 3 American 246 10.0
## 4 Other - Multiple 116 4.74
## 5 Swedish 99 4.04
## 6 Syrian/Lebanese 86 3.51
## 7 Finnish 58 2.37
## 8 Scottish 49 2.00
## 9 French 44 1.80
## 10 Italian 41 1.67
## # ... with 34 more rows
# Passenger nationalities within each class (all nationalities): count and
# share (%) of the class, ordered by class and then by descending share.
dat %>%
  filter(category == "Passenger", !is.na(nationality2)) %>%
  group_by(class, nationality2) %>%
  summarise(count = n()) %>%
  # The result is still grouped by class, so percentages are per class.
  mutate(percent = 100 * (count / sum(count))) %>%
  arrange(class, desc(percent))
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 79 x 4
## # Groups: class [3]
## class nationality2 count percent
## <fct> <chr> <int> <dbl>
## 1 1st Class American 195 57.4
## 2 1st Class English 51 15
## 3 1st Class Canadian 27 7.94
## 4 1st Class Other - Multiple 14 4.12
## 5 1st Class French 10 2.94
## 6 1st Class Irish 6 1.76
## 7 1st Class Swiss 6 1.76
## 8 1st Class German 5 1.47
## 9 1st Class Scottish 5 1.47
## 10 1st Class Spanish 4 1.18
## # ... with 69 more rows
# Passenger nationalities within each class, keeping only nationalities that
# make up at least 5% of their class.
dat %>%
  filter(category == "Passenger", !is.na(nationality2)) %>%
  group_by(class, nationality2) %>%
  summarise(count = n()) %>%
  # The result is still grouped by class, so percentages are per class.
  mutate(percent = 100 * (count / sum(count))) %>%
  filter(percent >= 5) %>%
  arrange(class, desc(percent))
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 12 x 4
## # Groups: class [3]
## class nationality2 count percent
## <fct> <chr> <int> <dbl>
## 1 1st Class American 195 57.4
## 2 1st Class English 51 15
## 3 1st Class Canadian 27 7.94
## 4 2nd Class English 145 51.1
## 5 2nd Class Other - Multiple 25 8.80
## 6 2nd Class American 24 8.45
## 7 3rd Class English 112 15.8
## 8 3rd Class Irish 105 14.8
## 9 3rd Class Swedish 89 12.6
## 10 3rd Class Syrian/Lebanese 83 11.7
## 11 3rd Class Other - Multiple 69 9.73
## 12 3rd Class Finnish 52 7.33
# Mean, minimum and maximum passenger age per class. Rows with a missing age
# are dropped first, so the aggregates need no NA handling.
dat %>%
  filter(category == "Passenger", !is.na(age)) %>%
  group_by(class) %>%
  summarise(
    avg_age = mean(age),
    min_age = min(age),
    max_age = max(age)
  )
## # A tibble: 3 x 4
## class avg_age min_age max_age
## <fct> <dbl> <int> <int>
## 1 1st Class 39.1 0 71
## 2 2nd Class 30.0 0 71
## 3 3rd Class 25.1 0 74
# Survival outcome per class: count and share (%) of each `survived` value
# among passengers of that class with a known outcome.
dat %>%
  filter(category == "Passenger", !is.na(survived)) %>%
  group_by(class, survived) %>%
  summarise(count = n()) %>%
  # The result is still grouped by class, so percentages are per class.
  mutate(percent = 100 * (count / sum(count))) %>%
  arrange(class, survived)
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups: class [3]
## class survived count percent
## <fct> <chr> <int> <dbl>
## 1 1st Class Lost 123 38.0
## 2 1st Class Saved 201 62.0
## 3 2nd Class Lost 166 58.5
## 4 2nd Class Saved 118 41.5
## 5 3rd Class Lost 528 74.5
## 6 3rd Class Saved 181 25.5
# Survival outcome per gender: count and share (%) of each `survived` value
# among passengers of that gender with a known outcome.
dat %>%
  filter(category == "Passenger", !is.na(survived)) %>%
  group_by(gender, survived) %>%
  summarise(count = n()) %>%
  # The result is still grouped by gender, so percentages are per gender.
  mutate(percent = 100 * (count / sum(count))) %>%
  arrange(gender, survived)
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
## # A tibble: 4 x 4
## # Groups: gender [2]
## gender survived count percent
## <fct> <chr> <int> <dbl>
## 1 Female Lost 127 27.3
## 2 Female Saved 339 72.7
## 3 Male Lost 690 81.1
## 4 Male Saved 161 18.9
# Survival outcome per class and gender: count and share (%) of each
# `survived` value within every class/gender combination.
dat %>%
  filter(category == "Passenger", !is.na(survived)) %>%
  group_by(class, gender, survived) %>%
  summarise(count = n()) %>%
  # The result is still grouped by class and gender, so percentages are
  # computed within each class/gender pair.
  mutate(percent = 100 * (count / sum(count))) %>%
  arrange(class, gender)
## `summarise()` has grouped output by 'class', 'gender'. You can override using the `.groups` argument.
## # A tibble: 12 x 5
## # Groups: class, gender [6]
## class gender survived count percent
## <fct> <fct> <chr> <int> <dbl>
## 1 1st Class Female Lost 5 3.47
## 2 1st Class Female Saved 139 96.5
## 3 1st Class Male Lost 118 65.6
## 4 1st Class Male Saved 62 34.4
## 5 2nd Class Female Lost 12 11.3
## 6 2nd Class Female Saved 94 88.7
## 7 2nd Class Male Lost 154 86.5
## 8 2nd Class Male Saved 24 13.5
## 9 3rd Class Female Lost 110 50.9
## 10 3rd Class Female Saved 106 49.1
## 11 3rd Class Male Lost 418 84.8
## 12 3rd Class Male Saved 75 15.2
# Density ridges ----
# Ridge plot of passenger age by class, one density per survival outcome.
# Only passengers with both a known age and a known outcome are plotted.
surv_classhist <- dat %>%
  filter(category == "Passenger", !is.na(age), !is.na(survived)) %>%
  ggplot(aes(x = age, y = class)) +
  geom_density_ridges(aes(fill = factor(survived))) +
  labs(
    title = "Age Distribution of Survival Status By Class",
    x = "Age Distribution",
    y = "Passenger Class"
  ) +
  theme_minimal() +
  # Centre the title; must come after theme_minimal(), which resets the theme.
  theme(plot.title = element_text(hjust = 0.5))

# Print the plot with a manual fill palette (applied in factor-level order).
surv_classhist +
  scale_fill_manual(name = "Survival", values = c("black", "dark red"))
## Picking joint bandwidth of 3.69

# Ridge plot of passenger age by gender, one density per survival outcome.
# Only passengers with both a known age and a known outcome are plotted.
surv_agehist <- dat %>%
  filter(category == "Passenger", !is.na(age), !is.na(survived)) %>%
  ggplot(aes(x = age, y = gender)) +
  geom_density_ridges(aes(fill = factor(survived))) +
  labs(
    title = "Age Distribution of Survival Status By Gender",
    x = "Age Distribution",
    y = "Passenger Gender"
  ) +
  theme_minimal() +
  # Centre the title; must come after theme_minimal(), which resets the theme.
  theme(plot.title = element_text(hjust = 0.5))

# Print the plot with a manual fill palette (applied in factor-level order).
surv_agehist +
  scale_fill_manual(name = "Survival", values = c("black", "dark red"))
## Picking joint bandwidth of 3.88

# Ridge plot of passenger age by gender, faceted into one panel per class
# (stacked in three rows), one density per survival outcome. Only passengers
# with both a known age and a known outcome are plotted.
surv_ageclass_hist <- dat %>%
  filter(category == "Passenger", !is.na(age), !is.na(survived)) %>%
  ggplot(aes(x = age, y = gender)) +
  facet_wrap(~class, nrow = 3) +
  geom_density_ridges(aes(fill = factor(survived))) +
  labs(
    title = "Age Distribution of Survival Status By Class and Gender",
    x = "Age Distribution",
    y = "Passenger Gender"
  ) +
  theme_minimal() +
  # Centre the title; must come after theme_minimal(), which resets the theme.
  theme(plot.title = element_text(hjust = 0.5))

# Print the plot with a manual fill palette (applied in factor-level order).
surv_ageclass_hist +
  scale_fill_manual(name = "Survival", values = c("black", "dark red"))
## Picking joint bandwidth of 6.54
## Picking joint bandwidth of 5.57
## Picking joint bandwidth of 2.96
